# Import packages
library(tm)
## Loading required package: NLP
library(tidytext)
library(viridis)
## Loading required package: viridisLite
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(stringr)
# Plotting
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
# Co-occurance
library(gutenbergr)
library(stringr)
library(dplyr)
library(tidyr)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
# Machine learning
library(topicmodels)
library(clValid)
## Loading required package: cluster
##
## Attaching package: 'clValid'
## The following object is masked from 'package:igraph':
##
## clusters
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:igraph':
##
## edges, mst, ring
library(dplyr)
library(fpc)
library(cluster)
# Load full dataset
# Tab-separated YouTube search export (364 rows, 22 columns — see str() below).
# read.delim's defaults coerce character columns to factors here.
data <- read.delim('data/videolist_search364_2019_07_18-09_19_04(slime).tab')
str(data)
## 'data.frame': 364 obs. of 22 variables:
## $ position : int 1 2 3 4 5 6 7 8 9 10 ...
## $ channelId : Factor w/ 150 levels "UC_-5N2NYjk4F0cHivItgSuA",..: 108 99 61 130 25 116 13 130 125 52 ...
## $ channelTitle : Factor w/ 150 levels "/부후우BOOWHOWOO",..: 52 47 145 83 43 61 26 83 123 112 ...
## $ videoId : Factor w/ 350 levels "_ON8k6bRj7g",..: 254 150 18 156 16 209 342 281 83 120 ...
## $ publishedAt : Factor w/ 349 levels "2008-03-19T09:43:31.000Z",..: 349 348 344 343 337 330 340 328 336 339 ...
## $ publishedAtSQL : Factor w/ 349 levels "2008-03-19 09:43:31",..: 349 348 344 343 337 330 340 328 336 339 ...
## $ videoTitle : Factor w/ 345 levels "[SLIME CHALLENGES] Mystery Wheel of DUMP IT Slime Challenge!!!",..: 128 4 336 201 112 273 320 200 314 140 ...
## $ videoDescription : Factor w/ 344 levels "","#Slime#Satisfyingslime#Relaxingslime SLIME4KIDS ! MIXING ALL MY SLIME !! SLIME SMOOTHIE SATISFYING SLIME VIDEOS"| __truncated__,..: 72 56 327 209 40 119 70 208 313 167 ...
## $ videoCategoryId : int 26 22 24 24 22 22 24 24 24 22 ...
## $ videoCategoryLabel: Factor w/ 11 levels "Comedy","Education",..: 6 10 3 3 10 10 3 3 3 10 ...
## $ duration : Factor w/ 287 levels "PT0S","PT10M10S",..: 162 67 4 9 219 281 161 26 89 187 ...
## $ durationSec : int 1114 760 612 619 188 541 1110 640 798 1375 ...
## $ dimension : Factor w/ 1 level "2d": 1 1 1 1 1 1 1 1 1 1 ...
## $ definition : Factor w/ 2 levels "hd","sd": 1 1 1 1 1 1 1 1 1 1 ...
## $ caption : Factor w/ 2 levels "false","true": 1 1 1 1 1 1 1 1 1 1 ...
## $ thumbnail_maxres : Factor w/ 330 levels "","https://i.ytimg.com/vi/_ON8k6bRj7g/maxresdefault.jpg",..: 239 140 19 145 17 194 322 265 80 1 ...
## $ licensedContent : int 1 1 1 1 1 1 1 1 1 1 ...
## $ viewCount : int 337863 23412 609243 38728 53638 402098 234485 114358 81071 339161 ...
## $ likeCount : int 16215 1670 7322 1294 2338 6219 16627 2388 1855 24228 ...
## $ dislikeCount : int 473 30 907 68 77 616 474 127 92 924 ...
## $ favoriteCount : int 0 0 0 0 0 0 0 0 0 0 ...
## $ commentCount : int 4967 805 358 99 3789 580 1242 156 365 NA ...
```
### Create text dataframe
# Select the ID and free-text columns by name rather than fragile positional
# index (previously data[c(4, 7, 8)] = videoId, videoTitle, videoDescription).
textData <- data[c("videoId", "videoTitle", "videoDescription")]
# One row per video: concatenated title + description, keyed by the video ID.
# (collapse = NULL was dropped — it is paste()'s default.)
text_df <- data.frame(text = paste(textData$videoTitle, textData$videoDescription, sep = " "),
                      doc_id = textData$videoId)
str(text_df)
## 'data.frame': 364 obs. of 2 variables:
## $ text : Factor w/ 350 levels "[SLIME CHALLENGES] Mystery Wheel of DUMP IT Slime Challenge!!! Hey team and amazing beings! We did the mystery "| __truncated__,..: 131 4 341 205 115 277 325 204 319 143 ...
## $ doc_id: Factor w/ 350 levels "_ON8k6bRj7g",..: 254 150 18 156 16 209 342 281 83 120 ...
Filter English language content only and remove technical text (URLs, username tags and channel names)
# Define function for cleaning text dataframes
#
# Strips URLs, "*com" tokens, @username tags, non-alphabetic characters and
# common channel-name patterns from the `text` column, then keeps only rows
# whose cleaned text looks like English (contains common English function
# words and none of the listed Spanish ones).
#
# Args:
#   data: data frame with a character/factor `text` column.
# Returns: the cleaned, filtered data frame.
cleanData <- function (data) {
  data %>%
    # Remove URLs
    mutate(text = str_replace_all(text, "http[^[:space:]]*", "")) %>%
    mutate(text = str_replace_all(text, "[^[:space:]]*com", "")) %>%
    # Remove @username tags
    mutate(text = str_replace_all(text, "@[^[:space:]]*", "")) %>%
    # Remove non-alphabetic characters (retaining spaces and commas)
    mutate(text = str_replace_all(text, "[^[:alpha:], [:space:]]", "")) %>%
    # Remove (common) channel names
    mutate(text = str_replace_all(text, "[^[:space:]].slime.[^[:space:]]", "")) %>%
    mutate(text = str_replace_all(text, "[^[:space:]]*channel", "")) %>%
    # Subset only videos which contain common English language phrases.
    # BUG FIX: this previously tested the *global* text_df$text instead of the
    # piped data's own (cleaned) text column, so the function only worked by
    # accident when called on text_df itself. The trailing `%>% return()`
    # anti-pattern was also removed.
    filter(str_detect(text, pattern = "the | to | of | is | that ") &
             !str_detect(text, pattern = "con | la | en | para"))
}
# Clean text dataframe
# Non-English / technical rows are dropped: 364 videos -> 221 (see str() below)
text_df <- cleanData(text_df)
str(text_df)
## 'data.frame': 221 obs. of 2 variables:
## $ text : chr "MAKE THIS SLIME PRETTY CHALLENGE Slimeatory GUYS OUR APP IS FINALLY OUT IM SO EXCITED download it here on the "| __truncated__ " ETSY SLIME VS WISH SLIME Which Is Worth It Download Amino and search my profile name, itskristiii, to check o"| __truncated__ "Strawberry vs Banana Mixing Makeup Eyeshadow Into Slime Special Series Satisfying Slime Video Hi In this make"| __truncated__ "Black vs Rose Mixing Makeup Eyeshadow Into Slime Special Series Satisfying Slime Video Hi In this makeup dest"| __truncated__ ...
## $ doc_id: Factor w/ 350 levels "_ON8k6bRj7g",..: 254 150 209 94 160 95 310 207 189 121 ...
# Distinct YouTube category labels present in the dataset
youtube_category_names <- unique(data$videoCategoryLabel)
# NOTE(review): par() affects base graphics only — it has no effect on the
# ggplot below, and leaves a 2x1 layout set for any later base plots
par(mfrow=c(2,1))
# Bar chart of video counts per YouTube category (axis labels wrapped at 10 chars)
data %>%
count(videoCategoryLabel) %>%
ggplot(aes(videoCategoryLabel, n)) +
geom_col(fill="royalblue4") +
xlab("video category") +
ylab("count") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
theme(axis.title = element_text(size=20),
axis.text = element_text(size=10))
Remove common YouTube and social media terminology and non-thematic common words
# Words to ignore (common YouTube terms, common channel names, social media platforms)
# Stop-list as a tibble (columns: name, word) for use with %in% filters below
ignoreWords_df <-
(c("slime", "slimes", "watch", "channels", "series", "credits", "see", "watching", "video", "videos", "subscribe", "new", "like",
"instagram", "ig", "facebook", "channel", "dont", "follow", "twitter", "thank", "youtube", "de", "el", "snapchat", "tanyast",
"izabelastress", "troom", "compilation", "special", "check", "follow", "profile", "picture", "tbestsatisfying", "spotify",
"ill", "im", "email", "cmt", "daya", "tepslime", "dx", "karina", "sam", "jerlime", "boenatisfyvideos", "fgteev",
"con", "para", "en", "la"
)) %>%
enframe(value="word")
# Same stop-list as a single regex alternation for str_detect()
# NOTE(review): the surrounding spaces and newlines are part of each
# alternative (e.g. "slime " not "slime"), so matching is looser than the
# word list above — confirm this is intended before reusing
ignoreWords_cond <-
"slime | watch | channels | series | credits | see | watching | video | videos | subscribe | new | like |
instagram | ig | facebook | channel | dont | follow | twitter | thank | youtube | de | el | snapchat | tanyast |
izabelastress | troom | compilation | special | follow | profile | picture | tbestsatisfying | spotify |
ill | im | email | cmt | daya | tepslime | dx | karina | sam | jerlime | boenatisfyvideos | fgteev | con | para | en | la"
# Define function for cleaning Tidy data
#
# Removes stop words, the custom ignore-word list and over-long tokens
# (>= 11 characters) from a tokenised tidytext data frame.
#
# Args:
#   tidy_data: tokenised data frame with a `word` column (n == 1) or
#              `word1`/`word2` columns (bigrams, n == 2).
#   n: ngram size, 1 or 2.
# Returns: the filtered data frame.
#
# BUG FIX: the length filters were written as str_length(!word < 11) and
# str_length(word1 < 11), which take the string length of a *logical*
# ("TRUE"/"FALSE" -> 4/5, always truthy) and therefore never filtered
# anything; the intended condition is str_length(word) < 11. The scalar
# branch on `n` now uses if/else instead of the vectorised ifelse(), which
# is not meant for control flow.
cleanTidy <- function(tidy_data, n) { # Function for cleaning Tidy data format
  if (n == 1) {
    subset(tidy_data,
           (!word %in% stop_words$word) &
             (!word %in% ignoreWords_df$word) &
             (str_length(word) < 11))
  } else {
    subset(tidy_data,
           (!word1 %in% stop_words$word) & (!word2 %in% stop_words$word) &
             (!word1 %in% ignoreWords_df$word) & (!word2 %in% ignoreWords_df$word) &
             (str_length(word1) < 11) & (str_length(word2) < 11))
  }
}
# Tokenise text dataframe (Tidy format) and apply cleaning function
# One row per (doc_id, word) token; unnest_tokens() lowercases words
textTidy <- text_df %>%
unnest_tokens(word, text) %>%
cleanTidy(1)
head(textTidy)
## doc_id word
## 1.3 RnKKntjHALM pretty
## 1.4 RnKKntjHALM challenge
## 1.5 RnKKntjHALM slimeatory
## 1.6 RnKKntjHALM guys
## 1.8 RnKKntjHALM app
## 1.10 RnKKntjHALM finally
# Count frequency of words across corpus
# textTidy was already passed through cleanTidy(1) when it was built, so the
# second cleanTidy(1) call that used to sit here was redundant (the filters
# are idempotent) and has been removed.
textFreq <- textTidy %>%
  count(word, sort = TRUE)
# Plot the 20 most frequent words, most frequent at the top
textFreq %>%
  top_n(20, n) %>%
  ggplot(aes(x = reorder(word, n), y=n)) +
  geom_col(fill="#481668") +
  xlab('word') +
  ylab('count') +
  theme(legend.position="none") +
  coord_flip()
# Create table of most frequent colour names
# Colour-name stop-list as a tibble (columns: name, word)
colours_df <-
c("pink", "blue", "green", "purple", "red", "yellow", "orange", "black", "silver", "gold", "teal", "white", "brown") %>%
enframe(value="word")
# Word frequencies with colour names removed
non_colour <- textFreq %>%
filter(!word %in% colours_df$word)
# Top 20 non-colour words
non_colour %>%
top_n(20,n) %>%
ggplot(aes(x = reorder(word, n), y=n)) +
geom_col(fill="#453781") +
xlab('word') +
ylab('count') +
theme(legend.position="none") +
coord_flip()
# Per-video word frequencies (textTidy is already cleaned, so this second
# cleanTidy(1) is redundant but harmless — the filters are idempotent)
video_textFreq <- textTidy %>%
cleanTidy(1) %>%
group_by(doc_id) %>%
count(word, sort = TRUE)
# NOTE(review): sample() is called without set.seed(), so the four videos
# shown (and the figure) change on every run
four_videos <- sample(video_textFreq$doc_id, 4)
# Top 5 words for each of the four sampled videos, one facet per video
video_textFreq %>%
filter(doc_id %in% four_videos) %>%
group_by(doc_id) %>%
top_n(5, n) %>%
ggplot(aes(x = reorder(word, n), y=n, fill=factor(doc_id))) +
geom_col() +
scale_fill_manual(values=c("#440E57", "#462E7B", "#433E85", "#38588C", "#32648E"), aesthetics = "fill") +
xlab('word') +
ylab('count') +
theme(legend.position="none") +
coord_flip() +
facet_wrap(~doc_id, ncol = 2, scales = "free_y") +
scale_color_viridis(discrete=FALSE)
# Create dataframe for YouTube category data
# Select by column name rather than fragile positional index
# (previously data[c(4, 10)] = videoId, videoCategoryLabel).
youtubeCategoryData <- data[c("videoId", "videoCategoryLabel")]
colnames(youtubeCategoryData) <- c("doc_id", "video_category")
# Attach each token's YouTube category via the video ID
youtubeCategories <- textTidy %>%
  left_join(youtubeCategoryData, by="doc_id")
# Word frequencies per category
category_termFreq <- youtubeCategories %>%
  group_by(video_category) %>%
  count(video_category, word, sort=TRUE)
# Top 2 words per category, one facet per category
category_termFreq %>%
group_by(video_category) %>%
top_n(2, n) %>%
ggplot(aes(x=reorder(word,n), y=n, fill=factor(video_category))) +
geom_col(show.legend = FALSE) +
facet_wrap(~video_category, scales = "free", ncol=2, nrow=6) +
scale_fill_manual(values=c("#440E57", "#462E7B", "#462E7B", "#433E85", "#31668E", "#413F86", "#38588C", "#32648E", "#39568C", "#2F6C8E"), aesthetics = "fill") +
xlab("word") +
ylab("count") +
coord_flip()
# Top 10 words for the three dominant categories only
category_termFreq %>%
subset(video_category == 'Entertainment' | video_category == 'Howto & Style' | video_category == 'People & Blogs') %>%
group_by(video_category) %>%
top_n(10, n) %>%
ggplot(aes(x=reorder(word,n), y=n, fill=factor(video_category))) +
geom_col(show.legend = FALSE) +
facet_wrap(~video_category, scales = "free", ncol=2, nrow=6) +
scale_fill_manual(values=c("#440E57", "#433E85", "#32648E"), aesthetics = "fill") +
xlab("word") +
ylab("count") +
coord_flip()
# tf-idf per (video, word): words that characterise individual videos
textTFIDF <- textTidy %>%
count(doc_id, word, sort=TRUE) %>%
bind_tf_idf(word, doc_id, n) %>%
arrange(desc(tf_idf))
# Plot only the most distinctive words (tf-idf above 0.8)
textTFIDF %>%
filter(tf_idf > 0.8) %>%
ggplot(aes(x = reorder(word, tf_idf), y=tf_idf)) +
geom_col(fill="#433E85") +
xlab('word') +
ylab('tf-idf') +
theme(legend.position="none") +
coord_flip()
# tf-idf per (category, word): words that characterise YouTube categories
categoryTFIDF <- youtubeCategories %>%
count(video_category, word, sort=TRUE) %>%
bind_tf_idf(word, video_category, n) %>%
arrange(desc(tf_idf))
# Top 15 category-distinctive words
categoryTFIDF %>%
top_n(15, tf_idf) %>%
ggplot(aes(x = reorder(word, tf_idf), y=tf_idf)) +
geom_col(fill="#433E85") +
xlab('word') +
ylab('tf_idf') +
coord_flip()
Prepare bigrams
# Create tokenized bi-grams (two word pairings)
# Raw bigram counts over the cleaned corpus, most frequent first
textBigrams_raw <- text_df %>%
unnest_tokens(ngram, text, token = "ngrams", n = 2) %>%
count(ngram, sort = TRUE)
# Split each bigram into word1/word2 and drop stop/ignore words
textBigrams <- textBigrams_raw %>%
separate(ngram, c("word1", "word2"), sep = " ") %>%
cleanTidy(2)
head(textBigrams)
## # A tibble: 6 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 mixing makeup 91
## 2 makeup eyeshadow 90
## 3 yellow blue 80
## 4 yellow pink 78
## 5 orange purple 77
## 6 green red 74
# Plot a bigram table as a directed network graph.
# `bigrams` must have columns word1, word2 and n; edge transparency encodes
# the bigram count. The fixed seed makes the force-directed ("fr") layout
# reproducible between runs.
visualize_bigrams <- function(bigrams) {
  set.seed(2016)
  edge_arrow <- grid::arrow(type = "closed", length = unit(.1, "inches"))
  bigram_graph <- graph_from_data_frame(bigrams)
  ggraph(bigram_graph, layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = edge_arrow) +
    geom_node_point(color = "#E3CD5B", size = 7) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1, size = 5) +
    theme_void()
}
# Network graph of the 55 most frequent bigrams (excluding "ronald")
# NOTE(review): the manual str_length filter here compensates for
# cleanTidy's length filter not taking effect — verify against cleanTidy
textBigrams %>%
subset((word1 != "ronald") & (word2 != "ronald")) %>%
subset(str_length(word1) < 11 & str_length(word2) < 11) %>%
top_n(55, n) %>%
# filter(word1 %in% frequentWords$word | word2 %in% frequentWords$word) %>%
visualize_bigrams()
# Bar chart of the 20 most frequent raw bigrams not containing "slime"
textBigrams_raw %>%
subset((!str_detect(textBigrams_raw$ngram, pattern="slime"))) %>%
top_n(20, n) %>%
ggplot(aes(x=reorder(ngram, n), y=n)) +
geom_col(fill="#E3CD5B") +
xlab('word pairing') +
ylab('frequency') +
coord_flip()
# Tokenise into trigrams (three-word phrases)
text_trigrams <- text_df %>%
unnest_tokens(ngram, text, token = "ngrams", n = 3)
# Plot the 20 most frequent trigrams not containing "slime"
text_trigrams %>%
subset(!str_detect(text_trigrams$ngram, pattern = "slime")) %>%
count(ngram, sort = TRUE) %>%
top_n(20, n) %>%
ggplot(aes(x=reorder(ngram, n), y=n)) +
geom_col(fill="#D2C066") +
xlab('phrase') +
ylab('frequency') +
coord_flip()
# Explore count distribution
top_freq <- subset(video_textFreq, video_textFreq$n >= 2) # Subset on words which occur at least twice
# Boxplot of per-video word counts before any capping
top_freq %>%
ggplot(aes(y=n)) +
geom_boxplot(fill="#22928C")
Cap distribution at 98th percentile to reduce variance
# Replace counts over 98th percentile with 98th percentile value
# NOTE(review): the percentile is computed on the *unfiltered*
# video_textFreq counts, not on top_freq itself — confirm intended
top_freq$n[top_freq$n > quantile(video_textFreq$n, probs=c(.98), na.rm = T)] <- quantile(video_textFreq$n, probs=c(.98), na.rm = T)
# Re-check distribution
top_freq %>%
ggplot(aes(y=n)) +
geom_boxplot(fill="#22928C")
Normalise count variable (‘n’) by video
# Define normalise function
#
# Min-max scale a numeric vector to [0, 1].
#
# Args:
#   x: numeric vector.
#   na.rm: drop NAs when computing the range. Default FALSE preserves the
#          original behaviour (an NA anywhere makes the range NA).
# Returns: rescaled vector the same length as x. A constant vector yields
# NaN (0/0); callers below already handle this via is.na().
normalise <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Group by video and normalise within these grouped ranges
topFreq_prepped <- top_freq %>%
group_by(doc_id) %>%
mutate(n = normalise(n)) %>%
# Videos whose words all share one count normalise to NaN (0/0); map to 0
mutate(n = ifelse(is.na(n), 0, n)) %>%
ungroup()
# Check new distribution
topFreq_prepped %>%
ggplot(aes(y=n)) +
geom_boxplot(fill="#22928C")
Format data into dissimilarity matrix
# Cast normalised data to DTM (Document Term Matrix)
video_DTMnormal <- topFreq_prepped %>%
cast_dtm(doc_id, word, n)
# Reduce sparsity of DTM
video_DTMSnormal <- removeSparseTerms(video_DTMnormal, 0.8)
# Create dissimilarity matrix
# NOTE(review): the distance is computed on the *full* DTM, not on the
# sparsity-reduced video_DTMSnormal created just above, which is otherwise
# unused — confirm which was intended
docsdissimNorm <- dist(video_DTMnormal, method="euclidean")
str(docsdissimNorm)
## 'dist' num [1:61075] 0 0 1.41 0 0 ...
## - attr(*, "Size")= int 350
## - attr(*, "Labels")= chr [1:350] "_ON8k6bRj7g" "_ROBKShzx-c" "_YLgQcYMghI" "_zXwDXa4LTo" ...
## - attr(*, "Diag")= logi FALSE
## - attr(*, "Upper")= logi FALSE
## - attr(*, "method")= chr "euclidean"
## - attr(*, "call")= language dist(x = video_DTMnormal, method = "euclidean")
Determine optimal clustering model and input
# Fit clValid method based on hierarchical and K-means clustering
# Internal validation (connectivity, Dunn, silhouette) for k = 2..10
clmethods <- c("hierarchical","kmeans")
internalValidationNorm <- clValid(as.matrix(docsdissimNorm), nClust = 2:10,
clMethods = clmethods, validation = "internal")
# Best (method, k) per validation measure — see output below
optimalScores(internalValidationNorm)
## Score Method Clusters
## Connectivity 3.3996032 kmeans 2
## Dunn 0.3668525 hierarchical 10
## Silhouette 0.8160626 hierarchical 5
Plot internal validation results
# Create graphical matrix
par(mfrow=c(2,2),mar=c(4,4,3,1))
par(xpd=TRUE)
# Plot internal validation results
plot(internalValidationNorm, legend=FALSE, pch=c(2, 3), main="")
# Empty panel (type="n") used purely as a canvas for the shared legend
plot(nClusters(internalValidationNorm), measures(internalValidationNorm,"Dunn")[,,1], type="n",axes=F, xlab="", ylab="", pch=c(2, 3))
legend("center", clusterMethods(internalValidationNorm), col=1:9, lty=1:9, pch=c(2, 3))
# Reset graphical parameters
# NOTE(review): op is captured *after* par was modified above, so par(op)
# restores the modified settings, not the session defaults — verify
op <- par(no.readonly=TRUE)
par(op)
Evaluate clustering parameters
# Perform hierarchical clustering using three linkage methods
# Each fit is converted to a dendrogram for plotting below
h_avg <- hclust(docsdissimNorm, method = "average") %>%
as.dendrogram()
h_comp <- hclust(docsdissimNorm, method = "complete") %>%
as.dendrogram()
h_sing <- hclust(docsdissimNorm, method = "single") %>%
as.dendrogram()
par(mfrow=c(2,1))
# Define sub-plots for matrix
# (plot titles: "dendogram" typo corrected to "dendrogram")
# Sub-plot for average method results
# Whole tree
plot(h_avg, leaflab = "none", xlab="videos", ylab="tree height", main="Average linkage dendrogram", xlim = c(1, 180))
abline(h = 2.9, col="#481668", lty=2)
text(150, 2.45, "5 branches at 2.9", col="#481668", cex = 1)
# Zoomed in on 5 branch point
plot(h_avg, xlim = c(1, 35), ylim = c(1.9,3.7), leaflab = "none", xlab="videos", ylab="tree height", main="Zoomed at 5 branches: Average linkage")
abline(h = 2.9, col="#481668", lty=2)
text(25, 3.2, "5 branches at 2.9", col="#481668", cex = 1)
par(mfrow=c(2,1))
# Sub-plot for complete method results
# Whole tree
plot(h_comp, leaflab = "none", xlab="videos", ylab="tree height", main="Complete linkage dendrogram", xlim = c(1, 400))
abline(h = 3.7, col="#481668", lty=2)
text(270, 3, "5 branches at 3.7", col="#481668", cex = 1)
# Zoomed in on 5 branch point
plot(h_comp, xlim = c(1, 400), ylim = c(2.00,4.3), leaflab = "none", xlab="videos", ylab="tree height", main="Zoomed at 5 branches: Complete linkage")
abline(h = 3.7, col="#481668", lty=2)
text(200, 3.3, "5 branches at 3.7", col="#481668", cex = 1)
# Sub-plot for single method results
# Whole tree
plot(h_sing, xlab="videos", ylab="tree height", main="Single linkage dendrogram", xlim = c(1, 180), leaflab = "none")
abline(h = 2.4, col="#481668", lty=2)
text(150, 2.25, "5 branches at 2.4", col="#481668", cex = 1)
# Zoomed in on 5 branch point
# BUG FIX: nodePar was (pch = c(19, 17, col="#27AB82")), which builds a
# character vector and leaks a stray global `pch` as a side effect;
# plot.dendrogram expects nodePar to be a *list* of plotting parameters.
plot(h_sing, xlim = c(1, 15), leaflab = "none", xlab="videos", ylab="tree height", main="Zoomed at 5 branches: Single linkage", nodePar = list(pch = c(19, 17), col = "#27AB82"))
abline(h = 2.4, col="#481668", lty=2)
text(12, 2.5, "5 branches at 2.4", col="#481668", cex = 1)
Perform clustering and add results to text dataframe
# Cut the tree into 5 clusters using single linkage
h_clust5 <- hclust(docsdissimNorm, method = "single") %>%
cutree(k=5)
# Add clusters to text dataframe
# Turn cluster results into data frame with appropriate column names
# (cutree's names become rownames, promoted here to a doc_id join key)
h_clust5 <- data.frame(h_clust5)
h_clust5$doc_id <- rownames(h_clust5)
colnames(h_clust5) <- c("h_cluster", "doc_id")
str(h_clust5)
## 'data.frame': 350 obs. of 2 variables:
## $ h_cluster: int 1 1 1 1 1 1 1 1 1 1 ...
## $ doc_id : chr "_ON8k6bRj7g" "_ROBKShzx-c" "_YLgQcYMghI" "_zXwDXa4LTo" ...
# Create dataframe to hold text and cluster data
text_categories <- left_join(text_df, h_clust5, by='doc_id')
## Warning: Column `doc_id` joining factor and character vector, coercing into
## character vector
str(text_categories)
## 'data.frame': 221 obs. of 3 variables:
## $ text : chr "MAKE THIS SLIME PRETTY CHALLENGE Slimeatory GUYS OUR APP IS FINALLY OUT IM SO EXCITED download it here on the "| __truncated__ " ETSY SLIME VS WISH SLIME Which Is Worth It Download Amino and search my profile name, itskristiii, to check o"| __truncated__ "Strawberry vs Banana Mixing Makeup Eyeshadow Into Slime Special Series Satisfying Slime Video Hi In this make"| __truncated__ "Black vs Rose Mixing Makeup Eyeshadow Into Slime Special Series Satisfying Slime Video Hi In this makeup dest"| __truncated__ ...
## $ doc_id : chr "RnKKntjHALM" "hHiNK-W488E" "n2Sh4hE8vLs" "cmos4XxA6GA" ...
## $ h_cluster: int 1 1 1 1 1 1 1 1 1 1 ...
# Word frequencies per hierarchical cluster
# NOTE(review): stop/ignore words are filtered inline here instead of via
# cleanTidy(1) (see the commented-out call), so no word-length filter is
# applied — confirm the inconsistency with the k-means version is intended
HCluster_freq <- text_categories %>%
# cleanTidy(1) %>%
unnest_tokens(word, text) %>%
subset(!(word %in% stop_words$word)) %>%
subset(!(word %in% ignoreWords_df$word)) %>%
count(h_cluster, word)
# Top 10 words per cluster, one facet per cluster
HCluster_freq %>%
group_by(h_cluster) %>%
top_n(10, n) %>%
ggplot(aes(reorder(word, n), n, fill = factor(h_cluster))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ h_cluster, scales = "free") +
xlab("word") +
ylab("frequency") +
coord_flip()
# Define for plotting K for a given range
# Fits k-means on the global dissimilarity matrix for each k in begin:finish
# and draws a clusplot per k, arranged in a 2x2 base-graphics matrix.
# Relies on globals: docsdissimNorm (dist object) and op (saved par state).
# NOTE(review): kmeans() runs without a fixed seed, so the plots are not
# reproducible between calls.
plotK <- function(begin, finish) {
par(op)
par(mfrow=c(2,2))
for (i in begin:finish) {
# Panel title for this k
title <- paste0("Clusters when k=", toString(i))
kfit <- kmeans(docsdissimNorm, i)
clusplot(as.matrix(docsdissimNorm), kfit$cluster, color=T, shade=T, lines=0, main=title)
}
par(op)
}
# Plot clustering with K between 3 and 6 (inclusive)
plotK(3, 6)
par(op)
par(mfrow=c(2,2))
# Re-fit k=5 four times to eyeball stability of the solution
# NOTE(review): no seed is set, so each fit — including the final kfit5
# reused for cluster assignments below — can differ between runs
for (i in 1:4) {
kfit5 <- kmeans(docsdissimNorm, 5)
clusplot(as.matrix(docsdissimNorm), kfit5$cluster, color=T, shade=T, lines=0, main="k=5")
}
par(op)
par(mfrow=c(1,1))
# Final k=5 fit whose assignments are joined onto the text data below
kfit5 <- kmeans(docsdissimNorm, 5)
clusplot(as.matrix(docsdissimNorm), kfit5$cluster, color=T, shade=T, lines=0, main="k=5")
Perform clustering and add results to text dataframe
# Turn k-means assignments into a data frame keyed by video ID
Kclusters_5 <- as.data.frame(kfit5$cluster)
Kclusters_5$doc_id <- rownames(Kclusters_5)
colnames(Kclusters_5) <- c("k_cluster", "doc_id")
str(Kclusters_5)
## 'data.frame': 350 obs. of 2 variables:
## $ k_cluster: int 2 2 2 5 2 2 2 2 2 2 ...
## $ doc_id : chr "_ON8k6bRj7g" "_ROBKShzx-c" "_YLgQcYMghI" "_zXwDXa4LTo" ...
# Add k-means cluster column alongside the hierarchical one
text_categories <- left_join(text_categories, Kclusters_5, by="doc_id")
str(text_categories)
## 'data.frame': 221 obs. of 4 variables:
## $ text : chr "MAKE THIS SLIME PRETTY CHALLENGE Slimeatory GUYS OUR APP IS FINALLY OUT IM SO EXCITED download it here on the "| __truncated__ " ETSY SLIME VS WISH SLIME Which Is Worth It Download Amino and search my profile name, itskristiii, to check o"| __truncated__ "Strawberry vs Banana Mixing Makeup Eyeshadow Into Slime Special Series Satisfying Slime Video Hi In this make"| __truncated__ "Black vs Rose Mixing Makeup Eyeshadow Into Slime Special Series Satisfying Slime Video Hi In this makeup dest"| __truncated__ ...
## $ doc_id : chr "RnKKntjHALM" "hHiNK-W488E" "n2Sh4hE8vLs" "cmos4XxA6GA" ...
## $ h_cluster: int 1 1 1 1 1 1 1 1 1 1 ...
## $ k_cluster: int 5 3 1 1 5 4 1 5 1 3 ...
# Transform to Tidy format
# Word frequencies per k-means cluster (cleanTidy removes stop/ignore words)
kCluster_freq <- text_categories %>%
unnest_tokens(word, text) %>%
cleanTidy(1) %>%
count(k_cluster, word)
# Plot frequent terms
# Top 10 words per cluster, one facet per cluster
kCluster_freq %>%
group_by(k_cluster) %>%
top_n(10, n) %>%
ggplot(aes(reorder(word, n), n, fill = factor(k_cluster))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ k_cluster, scales = "free") +
xlab("word") +
ylab("frequency") +
coord_flip()
# Cast tidy dataframe to DTM
# NOTE(review): video_textFreq was already built via cleanTidy(1); the
# re-application here is redundant but harmless (filters are idempotent)
video_DTM <- video_textFreq %>%
cleanTidy(1) %>%
cast_dtm(doc_id, word, n)
# Remove documents with no repeated words
# Row sums of term counts; a zero-sum row is an empty document (LDA fails on these)
raw.sum <- apply(video_DTM, 1,FUN=sum)
topic_DTM <- video_DTM[raw.sum!=0,]
str(topic_DTM)
## List of 6
## $ i : int [1:9306] 30 81 141 180 5 6 13 18 19 26 ...
## $ j : int [1:9306] 1 1 1 1 2 2 2 2 2 2 ...
## $ v : num [1:9306] 34 1 1 1 10 20 17 17 16 17 ...
## $ nrow : int 213
## $ ncol : int 3368
## $ dimnames:List of 2
## ..$ Docs : chr [1:213] "_zXwDXa4LTo" "-OH89gNc2X0" "0K1WZUfvE54" "0tXKZr7tQ7o" ...
## ..$ Terms: chr [1:3368] "playlist" "pink" "blue" "roblox" ...
## - attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
## - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
# Function for fitting topic model and plotting top n words
#
# Fits an LDA topic model to a DocumentTermMatrix and plots the term_n
# highest-probability (beta) terms per topic, one facet per topic.
#
# Args:
#   data: a DocumentTermMatrix with no empty documents.
#   topic_n: number of topics (k) for LDA.
#   term_n: number of top terms to display per topic.
# Returns: a ggplot object.
plotTopics <- function (data, topic_n, term_n) {
  # BUG FIX: removed the pointless double assignment
  # (topics <- lda <- LDA(...)); `lda` was never used.
  topics <- LDA(data, k = topic_n, control = list(seed = 1234))
  # Per-topic term probabilities (beta) in tidy long format
  video_topics <- tidy(topics, matrix = "beta")
  video_topics %>%
    group_by(topic) %>%
    top_n(term_n, beta) %>%
    ggplot(aes(reorder(term, beta), beta, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    xlab("word") +
    ylab("probability (beta)") +
    coord_flip()
}
# Sweep k from 3 to 7 topics to compare topic coherence visually
topic_DTM %>% plotTopics(3, 10)
topic_DTM %>% plotTopics(4, 10)
topic_DTM %>% plotTopics(5, 10)
topic_DTM %>% plotTopics(6, 10)
topic_DTM %>% plotTopics(7, 10)
# Perform topic analysis based on 6 topics
# (the comment previously said 5, but k = 6 is what is fitted; the unused
# `lda <-` double assignment was also removed)
topics <- LDA(topic_DTM, 6, control = list(seed = 1234))
# Tidy: per-topic term probabilities (beta) in long format
video_topics <- tidy(topics, matrix = "beta")
library(tidyr)
# Contrast topics 1 and 4: spread beta by topic, keep terms prominent in
# either (beta > 0.007), and compute the log2 probability ratio
beta_diff <- video_topics %>%
subset(topic == 1 | topic == 4) %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > 0.007 | topic4 > 0.007) %>%
mutate(log_ratio = log2(topic1 / topic4))
# Diverging bar chart: positive = more topic-1, negative = more topic-4
beta_diff %>%
ggplot(aes(reorder(term, log_ratio), log_ratio, fill=(log_ratio))) +
geom_col(show.legend = FALSE) +
scale_fill_viridis(option="cividis") +
xlab("term") +
ylab("log ratio topic1/topic4") +
coord_flip()
library(tidyr)
# Contrast topics 5 and 6 the same way (note the ratio is topic6 / topic5)
beta_diff <- video_topics %>%
subset(topic == 5 | topic == 6) %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic5 > 0.007 | topic6 > 0.007) %>%
mutate(log_ratio = log2(topic6 / topic5))
# Diverging bar chart: positive = more topic-6, negative = more topic-5
beta_diff %>%
ggplot(aes(reorder(term, log_ratio), log_ratio, fill=(log_ratio))) +
geom_col(show.legend = FALSE) +
scale_fill_viridis(option="cividis") +
xlab("term") +
ylab("log ratio") +
coord_flip()
Create dataframe for engagement variables
# Target views data: engagement metrics per video
views_df <- data[c("videoId", "videoCategoryLabel", "viewCount", "likeCount", "commentCount", "dislikeCount")]
# Duplicate the video ID under the join key name used elsewhere
views_df$doc_id <- views_df$videoId
# Replace NAs with mean
# BUG FIX: mean() of a vector containing NA is NA unless na.rm = TRUE, so
# the previous imputation wrote NA straight back (hence ggplot's
# "Removed 1 rows containing non-finite values" warning).
views_df[which(is.na(views_df$viewCount)), "viewCount"] <- mean(views_df$viewCount, na.rm = TRUE)
# Explore distribution of view counts
views_df %>%
  ggplot(aes(y= viewCount)) +
  geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
Cap distribution at 95th percentile to reduce variance
# Find 95th quantile
qnt <- quantile(views_df$viewCount, probs=c(.95), na.rm = T)
# Cap values above 95th quantile with 95th quantile value (winsorise)
views_df$viewCount[views_df$viewCount > qnt] <- qnt
# Re-explore distribution after capping
views_df %>%
ggplot(aes(y= viewCount)) +
geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
Join with word frequency data
# Join per-video word frequencies with engagement metrics by video ID
words_views <- left_join(video_textFreq, views_df, by='doc_id')
head(words_views)
## # A tibble: 6 x 9
## # Groups: doc_id [4]
## doc_id word n videoId videoCategoryLa… viewCount likeCount
## <fct> <chr> <int> <fct> <fct> <dbl> <int>
## 1 7ekjJ… play… 34 7ekjJI… Howto & Style 5808 30
## 2 Gt3Zh… pink 34 Gt3Zhg… People & Blogs 974456 11703
## 3 Gt3Zh… pink 34 Gt3Zhg… People & Blogs 974456 11703
## 4 JJpAa… pink 34 JJpAaO… People & Blogs 2815302 25287
## 5 JJpAa… pink 34 JJpAaO… People & Blogs 2815302 25287
## 6 nbvjN… pink 34 nbvjNm… People & Blogs 609265 7011
## # … with 2 more variables: commentCount <int>, dislikeCount <int>
# Scatter of word frequency vs views for six key theme words;
# point size encodes likes, colour encodes the word
words_views %>%
subset(word =='makeup' | word=='challenge' | word=='mixing' | word== 'satisfying' | word == 'asmr' | word == 'diy') %>%
ggplot(aes(n, viewCount, color=word, size=likeCount)) +
scale_color_viridis(option="cividis", discrete=TRUE) +
geom_jitter()
# Plot views per category, excluding values at or above the 97th percentile
data %>%
subset(viewCount < quantile(viewCount, probs=c(.97), na.rm = T)) %>%
ggplot(aes(videoCategoryLabel, viewCount)) +
geom_boxplot(fill="royalblue4") +
xlab("video category") +
ylab("video views") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
theme(axis.title = element_text(size=20),
axis.text = element_text(size=15),
legend.position="none")